#
# Copyright (C) 2021 by Intel Corporation
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
#

	.intel_syntax noprefix

	.globl _expand_avx2
	.globl expand_avx2

	# void expand_avx2(int32_t *out, int32_t *in, size_t N);
	#
	# Walks i = 0 .. N-1 while keeping a running source index j (starts at 0).
	# Whenever in[i] > 0 (signed compare) the routine stores out[i] = in[j]
	# and advances j by one. Elements out[i] with in[i] <= 0 are not written.
	#
	# Full groups of 8 elements go through the AVX2 path; the remaining
	# N mod 8 elements (or everything when N < 8) go through a scalar loop,
	# so N may be any value including 0 and nothing past index N-1 is touched.
	#
	# On entry (System V AMD64 argument registers):
	#     rdi = out
	#     rsi = in   (no alignment requirement)
	#     rdx = N    (element count)
	# Clobbers: rax, rcx, r8, r9, r10, ymm0-ymm4, flags
	# Stack:    unused; only volatile registers serve as scratch
	# Requires: AVX2 and POPCNT

	.text

_expand_avx2:
expand_avx2:

	xor r8d, r8d			# r8  = i, element being tested / written
	xor r10d, r10d			# r10 = j, next source element to consume

	mov r9, rdx
	and r9, -8			# r9 = N rounded down to a multiple of 8
	jz expand_avx2_tail		# no full vector available, scalar path only

	vpxor ymm0, ymm0, ymm0		# ymm0 = 0, threshold for the > 0 test
	lea rcx, shuf2[rip]		# rcx = base of the vpermd index table

expand_avx2_vec_loop:
	vmovdqu ymm1, [rsi+r8*4]	# in[i .. i+7], unaligned load is always safe
	vpcmpgtd ymm2, ymm1, ymm0	# ymm2 = per-lane mask of in[i+k] > 0
	vmovdqu ymm1, [rsi+r10*4]	# in[j .. j+7]; j <= i, so still inside in[]
	vmovmskps eax, ymm2		# eax = 8-bit lane mask (upper rax zeroed)
	shl eax, 5			# table rows are 32 bytes wide
	vmovdqa ymm3, [rcx+rax]		# row is 32-byte aligned via .p2align 5
	vpermd ymm4, ymm3, ymm1		# move in[j + rank] into each selected lane
	popcnt eax, eax			# shift does not change the bit count
	add r10, rax			# j += number of selected lanes
	vmaskmovps [rdi+r8*4], ymm2, ymm4	# write selected lanes of out[i .. i+7] only
	add r8, 8
	cmp r8, r9
	jb expand_avx2_vec_loop		# unsigned compare on an element count

expand_avx2_tail:
	cmp r8, rdx
	jae expand_avx2_done		# N was 0 or a multiple of 8

expand_avx2_tail_loop:
	cmp dword ptr [rsi+r8*4], 0
	jle expand_avx2_tail_next	# in[i] <= 0: leave out[i] alone
	mov eax, [rsi+r10*4]
	mov [rdi+r8*4], eax		# out[i] = in[j]
	inc r10
expand_avx2_tail_next:
	inc r8
	cmp r8, rdx
	jb expand_avx2_tail_loop

expand_avx2_done:
	vzeroupper			# avoid AVX/SSE transition penalties in the caller
	ret

#ifdef __APPLE__
	.section __TEXT,__const
#else
	.section .rodata
#endif
	.p2align 5

	# shuf2: read-only permutation table used by expand_avx2.
	#
	# Layout: one row per possible 8-bit lane mask (256 rows), each row
	# holding eight 32-bit indices (32 bytes). expand_avx2 addresses a row
	# as shuf2 + (mask << 5) and feeds it to vpermd as the index vector.
	# The .p2align 5 above keeps every row 32-byte aligned, which the
	# aligned vmovdqa row load relies on.
	#
	# Row contents: for mask m, lane k holds the number of set bits of m
	# below bit k when bit k of m is set, and 0 when bit k is clear.
	# Lanes with a clear mask bit are never stored by the masked store in
	# expand_avx2, so their value is a filler.
shuf2:
	.int 0, 0, 0, 0, 0, 0, 0, 0
	.int 0, 0, 0, 0, 0, 0, 0, 0
	.int 0, 0, 0, 0, 0, 0, 0, 0
	.int 0, 1, 0, 0, 0, 0, 0, 0
	.int 0, 0, 0, 0, 0, 0, 0, 0
	.int 0, 0, 1, 0, 0, 0, 0, 0
	.int 0, 0, 1, 0, 0, 0, 0, 0
	.int 0, 1, 2, 0, 0, 0, 0, 0
	.int 0, 0, 0, 0, 0, 0, 0, 0
	.int 0, 0, 0, 1, 0, 0, 0, 0
	.int 0, 0, 0, 1, 0, 0, 0, 0
	.int 0, 1, 0, 2, 0, 0, 0, 0
	.int 0, 0, 0, 1, 0, 0, 0, 0
	.int 0, 0, 1, 2, 0, 0, 0, 0
	.int 0, 0, 1, 2, 0, 0, 0, 0
	.int 0, 1, 2, 3, 0, 0, 0, 0
	.int 0, 0, 0, 0, 0, 0, 0, 0
	.int 0, 0, 0, 0, 1, 0, 0, 0
	.int 0, 0, 0, 0, 1, 0, 0, 0
	.int 0, 1, 0, 0, 2, 0, 0, 0
	.int 0, 0, 0, 0, 1, 0, 0, 0
	.int 0, 0, 1, 0, 2, 0, 0, 0
	.int 0, 0, 1, 0, 2, 0, 0, 0
	.int 0, 1, 2, 0, 3, 0, 0, 0
	.int 0, 0, 0, 0, 1, 0, 0, 0
	.int 0, 0, 0, 1, 2, 0, 0, 0
	.int 0, 0, 0, 1, 2, 0, 0, 0
	.int 0, 1, 0, 2, 3, 0, 0, 0
	.int 0, 0, 0, 1, 2, 0, 0, 0
	.int 0, 0, 1, 2, 3, 0, 0, 0
	.int 0, 0, 1, 2, 3, 0, 0, 0
	.int 0, 1, 2, 3, 4, 0, 0, 0
	.int 0, 0, 0, 0, 0, 0, 0, 0
	.int 0, 0, 0, 0, 0, 1, 0, 0
	.int 0, 0, 0, 0, 0, 1, 0, 0
	.int 0, 1, 0, 0, 0, 2, 0, 0
	.int 0, 0, 0, 0, 0, 1, 0, 0
	.int 0, 0, 1, 0, 0, 2, 0, 0
	.int 0, 0, 1, 0, 0, 2, 0, 0
	.int 0, 1, 2, 0, 0, 3, 0, 0
	.int 0, 0, 0, 0, 0, 1, 0, 0
	.int 0, 0, 0, 1, 0, 2, 0, 0
	.int 0, 0, 0, 1, 0, 2, 0, 0
	.int 0, 1, 0, 2, 0, 3, 0, 0
	.int 0, 0, 0, 1, 0, 2, 0, 0
	.int 0, 0, 1, 2, 0, 3, 0, 0
	.int 0, 0, 1, 2, 0, 3, 0, 0
	.int 0, 1, 2, 3, 0, 4, 0, 0
	.int 0, 0, 0, 0, 0, 1, 0, 0
	.int 0, 0, 0, 0, 1, 2, 0, 0
	.int 0, 0, 0, 0, 1, 2, 0, 0
	.int 0, 1, 0, 0, 2, 3, 0, 0
	.int 0, 0, 0, 0, 1, 2, 0, 0
	.int 0, 0, 1, 0, 2, 3, 0, 0
	.int 0, 0, 1, 0, 2, 3, 0, 0
	.int 0, 1, 2, 0, 3, 4, 0, 0
	.int 0, 0, 0, 0, 1, 2, 0, 0
	.int 0, 0, 0, 1, 2, 3, 0, 0
	.int 0, 0, 0, 1, 2, 3, 0, 0
	.int 0, 1, 0, 2, 3, 4, 0, 0
	.int 0, 0, 0, 1, 2, 3, 0, 0
	.int 0, 0, 1, 2, 3, 4, 0, 0
	.int 0, 0, 1, 2, 3, 4, 0, 0
	.int 0, 1, 2, 3, 4, 5, 0, 0
	.int 0, 0, 0, 0, 0, 0, 0, 0
	.int 0, 0, 0, 0, 0, 0, 1, 0
	.int 0, 0, 0, 0, 0, 0, 1, 0
	.int 0, 1, 0, 0, 0, 0, 2, 0
	.int 0, 0, 0, 0, 0, 0, 1, 0
	.int 0, 0, 1, 0, 0, 0, 2, 0
	.int 0, 0, 1, 0, 0, 0, 2, 0
	.int 0, 1, 2, 0, 0, 0, 3, 0
	.int 0, 0, 0, 0, 0, 0, 1, 0
	.int 0, 0, 0, 1, 0, 0, 2, 0
	.int 0, 0, 0, 1, 0, 0, 2, 0
	.int 0, 1, 0, 2, 0, 0, 3, 0
	.int 0, 0, 0, 1, 0, 0, 2, 0
	.int 0, 0, 1, 2, 0, 0, 3, 0
	.int 0, 0, 1, 2, 0, 0, 3, 0
	.int 0, 1, 2, 3, 0, 0, 4, 0
	.int 0, 0, 0, 0, 0, 0, 1, 0
	.int 0, 0, 0, 0, 1, 0, 2, 0
	.int 0, 0, 0, 0, 1, 0, 2, 0
	.int 0, 1, 0, 0, 2, 0, 3, 0
	.int 0, 0, 0, 0, 1, 0, 2, 0
	.int 0, 0, 1, 0, 2, 0, 3, 0
	.int 0, 0, 1, 0, 2, 0, 3, 0
	.int 0, 1, 2, 0, 3, 0, 4, 0
	.int 0, 0, 0, 0, 1, 0, 2, 0
	.int 0, 0, 0, 1, 2, 0, 3, 0
	.int 0, 0, 0, 1, 2, 0, 3, 0
	.int 0, 1, 0, 2, 3, 0, 4, 0
	.int 0, 0, 0, 1, 2, 0, 3, 0
	.int 0, 0, 1, 2, 3, 0, 4, 0
	.int 0, 0, 1, 2, 3, 0, 4, 0
	.int 0, 1, 2, 3, 4, 0, 5, 0
	.int 0, 0, 0, 0, 0, 0, 1, 0
	.int 0, 0, 0, 0, 0, 1, 2, 0
	.int 0, 0, 0, 0, 0, 1, 2, 0
	.int 0, 1, 0, 0, 0, 2, 3, 0
	.int 0, 0, 0, 0, 0, 1, 2, 0
	.int 0, 0, 1, 0, 0, 2, 3, 0
	.int 0, 0, 1, 0, 0, 2, 3, 0
	.int 0, 1, 2, 0, 0, 3, 4, 0
	.int 0, 0, 0, 0, 0, 1, 2, 0
	.int 0, 0, 0, 1, 0, 2, 3, 0
	.int 0, 0, 0, 1, 0, 2, 3, 0
	.int 0, 1, 0, 2, 0, 3, 4, 0
	.int 0, 0, 0, 1, 0, 2, 3, 0
	.int 0, 0, 1, 2, 0, 3, 4, 0
	.int 0, 0, 1, 2, 0, 3, 4, 0
	.int 0, 1, 2, 3, 0, 4, 5, 0
	.int 0, 0, 0, 0, 0, 1, 2, 0
	.int 0, 0, 0, 0, 1, 2, 3, 0
	.int 0, 0, 0, 0, 1, 2, 3, 0
	.int 0, 1, 0, 0, 2, 3, 4, 0
	.int 0, 0, 0, 0, 1, 2, 3, 0
	.int 0, 0, 1, 0, 2, 3, 4, 0
	.int 0, 0, 1, 0, 2, 3, 4, 0
	.int 0, 1, 2, 0, 3, 4, 5, 0
	.int 0, 0, 0, 0, 1, 2, 3, 0
	.int 0, 0, 0, 1, 2, 3, 4, 0
	.int 0, 0, 0, 1, 2, 3, 4, 0
	.int 0, 1, 0, 2, 3, 4, 5, 0
	.int 0, 0, 0, 1, 2, 3, 4, 0
	.int 0, 0, 1, 2, 3, 4, 5, 0
	.int 0, 0, 1, 2, 3, 4, 5, 0
	.int 0, 1, 2, 3, 4, 5, 6, 0
	.int 0, 0, 0, 0, 0, 0, 0, 0
	.int 0, 0, 0, 0, 0, 0, 0, 1
	.int 0, 0, 0, 0, 0, 0, 0, 1
	.int 0, 1, 0, 0, 0, 0, 0, 2
	.int 0, 0, 0, 0, 0, 0, 0, 1
	.int 0, 0, 1, 0, 0, 0, 0, 2
	.int 0, 0, 1, 0, 0, 0, 0, 2
	.int 0, 1, 2, 0, 0, 0, 0, 3
	.int 0, 0, 0, 0, 0, 0, 0, 1
	.int 0, 0, 0, 1, 0, 0, 0, 2
	.int 0, 0, 0, 1, 0, 0, 0, 2
	.int 0, 1, 0, 2, 0, 0, 0, 3
	.int 0, 0, 0, 1, 0, 0, 0, 2
	.int 0, 0, 1, 2, 0, 0, 0, 3
	.int 0, 0, 1, 2, 0, 0, 0, 3
	.int 0, 1, 2, 3, 0, 0, 0, 4
	.int 0, 0, 0, 0, 0, 0, 0, 1
	.int 0, 0, 0, 0, 1, 0, 0, 2
	.int 0, 0, 0, 0, 1, 0, 0, 2
	.int 0, 1, 0, 0, 2, 0, 0, 3
	.int 0, 0, 0, 0, 1, 0, 0, 2
	.int 0, 0, 1, 0, 2, 0, 0, 3
	.int 0, 0, 1, 0, 2, 0, 0, 3
	.int 0, 1, 2, 0, 3, 0, 0, 4
	.int 0, 0, 0, 0, 1, 0, 0, 2
	.int 0, 0, 0, 1, 2, 0, 0, 3
	.int 0, 0, 0, 1, 2, 0, 0, 3
	.int 0, 1, 0, 2, 3, 0, 0, 4
	.int 0, 0, 0, 1, 2, 0, 0, 3
	.int 0, 0, 1, 2, 3, 0, 0, 4
	.int 0, 0, 1, 2, 3, 0, 0, 4
	.int 0, 1, 2, 3, 4, 0, 0, 5
	.int 0, 0, 0, 0, 0, 0, 0, 1
	.int 0, 0, 0, 0, 0, 1, 0, 2
	.int 0, 0, 0, 0, 0, 1, 0, 2
	.int 0, 1, 0, 0, 0, 2, 0, 3
	.int 0, 0, 0, 0, 0, 1, 0, 2
	.int 0, 0, 1, 0, 0, 2, 0, 3
	.int 0, 0, 1, 0, 0, 2, 0, 3
	.int 0, 1, 2, 0, 0, 3, 0, 4
	.int 0, 0, 0, 0, 0, 1, 0, 2
	.int 0, 0, 0, 1, 0, 2, 0, 3
	.int 0, 0, 0, 1, 0, 2, 0, 3
	.int 0, 1, 0, 2, 0, 3, 0, 4
	.int 0, 0, 0, 1, 0, 2, 0, 3
	.int 0, 0, 1, 2, 0, 3, 0, 4
	.int 0, 0, 1, 2, 0, 3, 0, 4
	.int 0, 1, 2, 3, 0, 4, 0, 5
	.int 0, 0, 0, 0, 0, 1, 0, 2
	.int 0, 0, 0, 0, 1, 2, 0, 3
	.int 0, 0, 0, 0, 1, 2, 0, 3
	.int 0, 1, 0, 0, 2, 3, 0, 4
	.int 0, 0, 0, 0, 1, 2, 0, 3
	.int 0, 0, 1, 0, 2, 3, 0, 4
	.int 0, 0, 1, 0, 2, 3, 0, 4
	.int 0, 1, 2, 0, 3, 4, 0, 5
	.int 0, 0, 0, 0, 1, 2, 0, 3
	.int 0, 0, 0, 1, 2, 3, 0, 4
	.int 0, 0, 0, 1, 2, 3, 0, 4
	.int 0, 1, 0, 2, 3, 4, 0, 5
	.int 0, 0, 0, 1, 2, 3, 0, 4
	.int 0, 0, 1, 2, 3, 4, 0, 5
	.int 0, 0, 1, 2, 3, 4, 0, 5
	.int 0, 1, 2, 3, 4, 5, 0, 6
	.int 0, 0, 0, 0, 0, 0, 0, 1
	.int 0, 0, 0, 0, 0, 0, 1, 2
	.int 0, 0, 0, 0, 0, 0, 1, 2
	.int 0, 1, 0, 0, 0, 0, 2, 3
	.int 0, 0, 0, 0, 0, 0, 1, 2
	.int 0, 0, 1, 0, 0, 0, 2, 3
	.int 0, 0, 1, 0, 0, 0, 2, 3
	.int 0, 1, 2, 0, 0, 0, 3, 4
	.int 0, 0, 0, 0, 0, 0, 1, 2
	.int 0, 0, 0, 1, 0, 0, 2, 3
	.int 0, 0, 0, 1, 0, 0, 2, 3
	.int 0, 1, 0, 2, 0, 0, 3, 4
	.int 0, 0, 0, 1, 0, 0, 2, 3
	.int 0, 0, 1, 2, 0, 0, 3, 4
	.int 0, 0, 1, 2, 0, 0, 3, 4
	.int 0, 1, 2, 3, 0, 0, 4, 5
	.int 0, 0, 0, 0, 0, 0, 1, 2
	.int 0, 0, 0, 0, 1, 0, 2, 3
	.int 0, 0, 0, 0, 1, 0, 2, 3
	.int 0, 1, 0, 0, 2, 0, 3, 4
	.int 0, 0, 0, 0, 1, 0, 2, 3
	.int 0, 0, 1, 0, 2, 0, 3, 4
	.int 0, 0, 1, 0, 2, 0, 3, 4
	.int 0, 1, 2, 0, 3, 0, 4, 5
	.int 0, 0, 0, 0, 1, 0, 2, 3
	.int 0, 0, 0, 1, 2, 0, 3, 4
	.int 0, 0, 0, 1, 2, 0, 3, 4
	.int 0, 1, 0, 2, 3, 0, 4, 5
	.int 0, 0, 0, 1, 2, 0, 3, 4
	.int 0, 0, 1, 2, 3, 0, 4, 5
	.int 0, 0, 1, 2, 3, 0, 4, 5
	.int 0, 1, 2, 3, 4, 0, 5, 6
	.int 0, 0, 0, 0, 0, 0, 1, 2
	.int 0, 0, 0, 0, 0, 1, 2, 3
	.int 0, 0, 0, 0, 0, 1, 2, 3
	.int 0, 1, 0, 0, 0, 2, 3, 4
	.int 0, 0, 0, 0, 0, 1, 2, 3
	.int 0, 0, 1, 0, 0, 2, 3, 4
	.int 0, 0, 1, 0, 0, 2, 3, 4
	.int 0, 1, 2, 0, 0, 3, 4, 5
	.int 0, 0, 0, 0, 0, 1, 2, 3
	.int 0, 0, 0, 1, 0, 2, 3, 4
	.int 0, 0, 0, 1, 0, 2, 3, 4
	.int 0, 1, 0, 2, 0, 3, 4, 5
	.int 0, 0, 0, 1, 0, 2, 3, 4
	.int 0, 0, 1, 2, 0, 3, 4, 5
	.int 0, 0, 1, 2, 0, 3, 4, 5
	.int 0, 1, 2, 3, 0, 4, 5, 6
	.int 0, 0, 0, 0, 0, 1, 2, 3
	.int 0, 0, 0, 0, 1, 2, 3, 4
	.int 0, 0, 0, 0, 1, 2, 3, 4
	.int 0, 1, 0, 0, 2, 3, 4, 5
	.int 0, 0, 0, 0, 1, 2, 3, 4
	.int 0, 0, 1, 0, 2, 3, 4, 5
	.int 0, 0, 1, 0, 2, 3, 4, 5
	.int 0, 1, 2, 0, 3, 4, 5, 6
	.int 0, 0, 0, 0, 1, 2, 3, 4
	.int 0, 0, 0, 1, 2, 3, 4, 5
	.int 0, 0, 0, 1, 2, 3, 4, 5
	.int 0, 1, 0, 2, 3, 4, 5, 6
	.int 0, 0, 0, 1, 2, 3, 4, 5
	.int 0, 0, 1, 2, 3, 4, 5, 6
	.int 0, 0, 1, 2, 3, 4, 5, 6
	.int 0, 1, 2, 3, 4, 5, 6, 7

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

